package cn.ihoway.analysis;

import cn.ihoway.entity.Dict;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Stack;
import java.util.regex.Pattern;

/**
 * Splits a sentence into words using reverse (right-to-left) maximum matching
 * and reports where each word occurs.
 *
 * <p>A candidate substring is accepted as a word when {@code Dict.contain}
 * accepts it, when it is a single character, when it consists only of ASCII
 * letters, or when it is an integer/decimal number.
 */
public class SentAnalyzer {

    private static final String IS_NUM_REGEX = "^\\d+(\\.\\d+)?$"; // integer or decimal number
    private static final String IS_WORD_REGEX = "^[a-zA-Z]+$"; // ASCII letters only

    // Compiled once: isWordMatch is called inside the segmentation loops, and
    // String.matches would recompile the expression on every call.
    private static final Pattern NUM_PATTERN = Pattern.compile(IS_NUM_REGEX);
    private static final Pattern WORD_PATTERN = Pattern.compile(IS_WORD_REGEX);

    /** Longest candidate word considered; kept large because English words can be long. */
    private static final int MAX_WORD_LENGTH = 20;

    /** Key under which the total number of segmented words is stored in the analysis map. */
    private static final String COUNT_KEY = "$cnt";

    /**
     * Segments a sentence and records the position of every word.
     *
     * @param sentence the sentence to segment; {@code null} is treated as an empty sentence
     * @param index    offset of the sentence within the enclosing document; added to every
     *                 position
     * @return a map from each word (lower-cased) to its list of positions, each formatted as
     *         {@code "begin:end"} (end exclusive), e.g. {@code [12:14, 19:21]}; positions are
     *         listed in order of occurrence. The extra key {@code "$cnt"} maps to a
     *         single-element list holding the total number of words as a string.
     */
    public HashMap<String, ArrayList<String>> sentAnalyze(String sentence,int index){
        Stack<HashMap<String,Integer>> segments = sentSeg(sentence);
        HashMap<String, ArrayList<String>> analyze = new HashMap<>();
        int cnt = 0;
        // The stack was filled from the end of the sentence backwards, so popping
        // yields the words in reading order.
        while(!segments.isEmpty()) {
            HashMap<String,Integer> segment = segments.pop();
            String wordStr = "";
            int end = 0;
            // Each segment map carries one word -> end-offset pair.
            for(Map.Entry<String,Integer> entry : segment.entrySet()) {
                wordStr = entry.getKey();
                end = entry.getValue() + index;
            }
            int begin = end - wordStr.length();
            ArrayList<String> positions = analyze.get(wordStr);
            if(positions == null) {
                positions = new ArrayList<>();
                analyze.put(wordStr, positions);
            }
            positions.add(begin + ":" + end);
            cnt++;
        }
        ArrayList<String> cntArray = new ArrayList<>();
        cntArray.add(String.valueOf(cnt));
        analyze.put(COUNT_KEY, cntArray); // total number of words
        return analyze;
    }

    /**
     * Segments a sentence into words. Only reverse maximum matching is implemented.
     *
     * <p>Matching is case-insensitive: the sentence is lower-cased with
     * {@link Locale#ROOT} so the result does not depend on the JVM default locale
     * (e.g. Turkish dotless-i rules would otherwise break the ASCII-letter check).
     *
     * @param sentence the sentence to segment; {@code null} or empty yields an empty stack
     * @return a stack of single-entry maps (word -> exclusive end offset within the
     *         lower-cased sentence); the top of the stack is the first word of the sentence
     */
    public Stack<HashMap<String,Integer>> sentSeg(String sentence) {
        if(sentence == null || sentence.isEmpty()) {
            return new Stack<>();
        }
        String lowered = sentence.toLowerCase(Locale.ROOT);
        return reverseMaxMatch(lowered, MAX_WORD_LENGTH);
    }

    /**
     * Reverse maximum matching: starting from the end of the sentence, take the
     * longest suffix (at most {@code maxL} characters) accepted by
     * {@link #isWordMatch(String)}, emit it, and continue with the text before it.
     *
     * @param sentence the (already lower-cased) sentence
     * @param maxL     maximum word length to try
     * @return stack of single-entry maps (word -> exclusive end offset), pushed from
     *         the last word to the first
     */
    private Stack<HashMap<String,Integer>> reverseMaxMatch(String sentence, int maxL) {
        Stack<HashMap<String,Integer>> result = new Stack<>();
        int end = sentence.length();
        while(end > 0) {
            // The window can never be longer than the text still left before 'end'.
            int length = Math.max(1, Math.min(maxL, end));
            // Shrink the window from the left until the remaining suffix is a word.
            while(length > 0 && !isWordMatch(sentence.substring(end - length, end))) {
                length--;
            }
            if(length == 0) {
                // Defensive only: a single character is always accepted by isWordMatch.
                length = 1;
            }
            HashMap<String,Integer> word = new HashMap<>();
            word.put(sentence.substring(end - length, end), end);
            result.push(word);
            end -= length;
        }
        return result;
    }

    /**
     * Decides whether a candidate counts as a word. A candidate is accepted when
     * any of the following holds:
     * <ul>
     *   <li>{@code Dict.contain(word)} returns true</li>
     *   <li>it is exactly one character long</li>
     *   <li>it consists only of ASCII letters</li>
     *   <li>it is an integer or decimal number</li>
     * </ul>
     *
     * @param word candidate word
     * @return {@code true} if the candidate is accepted as a word
     */
    private boolean isWordMatch(String word){
        return Dict.contain(word)
                || word.length() == 1
                || WORD_PATTERN.matcher(word).matches()
                || NUM_PATTERN.matcher(word).matches();
    }

}
